library(tidyverse) # Colección de paquetes para ciencia de datos (incluye ggplot2, dplyr, tidyr, readr, purrr, tibble)
library(reshape) # Herramientas para reorganizar datos
library(Hmisc) # resúmenes estadísticos
library(limma) # Análisis de datos de expresión genética
library(AnnotationDbi) # Interfaz para bases de datos de anotaciones bioinformáticas
library(org.Hs.eg.db) # Datos de anotación para genes humanos
library(VennDiagram) # Generación de diagramas de Venn 
library(gridExtra) # Mostrar varias gráficas
library(patchwork) # Combinar múltiples ggplots en un único plot
library(ggrepel) # Mejora la visualización de texto en ggplots evitando solapamientos de texto
library(Rtsne) # Implementación de t-SNE
library(umap) # Implementación de UMAP
library(ggVennDiagram) # diagrama de Venn

1 Leyendo datos

Descargar los datos

# covariables
ROSMAP_RINPMIAGESEX_covs <- readRDS("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/ROSMAP_RINPMIAGESEX_covs.rds")
covs <- ROSMAP_RINPMIAGESEX_covs
rownames(covs) <- covs$mrna_id
covs$study <- as.factor(covs$study)
covs$projid <- as.character(covs$projid)
covs$ceradsc <- as.factor(covs$ceradsc)
covs$cogdx <- as.factor(covs$cogdx)
covs$neuroStatus <- as.factor(covs$neuroStatus)

#datos corregidos 
ROSMAP_RINPMIAGESEX_resids <- readRDS("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/ROSMAP_RINPMIAGESEX_resids.rds")
mat_exp <- ROSMAP_RINPMIAGESEX_resids

2 Geneset de alzheimer KEGG

# geneset de Alzheimer extraidos de KEGG

tab <- getGeneKEGGLinks(species="hsa")
tab$Symbol <- mapIds(org.Hs.eg.db, tab$GeneID,
                       column="SYMBOL", keytype="ENTREZID")

paths <- getKEGGPathwayNames(species="hsa")
paths[paths$PathwayID=="hsa05010", ]
geneset_alz <- tab$Symbol[tab$PathwayID=="hsa05010"]

Vamos a seleccionar solamente los genes de Alzheimer de KEGG en nuestro dataset de expresión génica.

mat_exp_alz_genes <- mat_exp[, colnames(mat_exp) %in% geneset_alz]

Visualizamos si se han seleccionado todos los genes y cuantos del geneset de KEGG

venn.plot <- venn.diagram(
  x = list(GenesMatriz = colnames(mat_exp), GenesetAlzheimer = geneset_alz),
  category.names = c("Matrix Genes", "Geneset Alzheimer"),
  filename = NULL,
  output = FALSE,  # Asegura que no se exporte a un archivo
  fill = c("#00CED1", "#E9967A"),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = 25, #posicion de las categoricas
  cat.dist = 0.1, #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = "Alzheimer genes in exp matrix",
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 1)
)

grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)

2.1 PCA

pca_alz_genes_scaled <- prcomp(mat_exp_alz_genes, scale = T)
# Scree plot con los datos escalados
var_exp <- pca_alz_genes_scaled$sdev^2
prop_var_exp <- var_exp / sum(var_exp)
cum_var_exp <- cumsum(prop_var_exp)

df_var_exp <- data.frame(Comp = 1:length(prop_var_exp), VarExp = prop_var_exp)
df_cum_var_exp <- data.frame(Comp = 1:length(cum_var_exp), CumVarExp = cum_var_exp)

ggplot(df_var_exp[1:20,], aes(x = Comp, y = VarExp)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  geom_line(aes(group = 1), color = "blue") +
  geom_point(color = "blue") +
  theme_minimal() +
  labs(x = "Principal components", y = "Variance", title = "Scree Plot scaled") +
  ylim(c(0,1)) +
  geom_line(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), color="#8B1A1A") +
  geom_point(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), color = "red") +
  geom_bar(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), stat = "identity", fill = "red", alpha= 0.25) +
  annotate("text", x = 4, y = 0.85, label = "Cumulative Scree Plot", color = "#8B1A1A", size = 4) +
  geom_text(data = df_cum_var_exp[seq(0,20,2),], aes(x=Comp, y = CumVarExp +0.04, label = round(CumVarExp, 2)))

Vemos que con las 20 primeras no encontramos ni siquiera el 80% de la varianza.

2.1.1 Seleccionar muestras con el doble de la desviación típica

outlierspc1_scaled <- as.data.frame(pca_alz_genes_scaled$x[abs(pca_alz_genes_scaled$x[,1]) > 2*(pca_alz_genes_scaled$sdev[1]),1])

outlierspc2_scaled <- as.data.frame(pca_alz_genes_scaled$x[abs(pca_alz_genes_scaled$x[,2]) > 2*(pca_alz_genes_scaled$sdev[2]),2])
df <- as.data.frame(pca_alz_genes_scaled$x)

plot1 <- ggplot(df, aes(x = PC1)) +
  geom_density(fill = "#00CED1", alpha = 0.5) +
  geom_vline(xintercept = mean(df$PC1) + 2*pca_alz_genes_scaled$sdev[1], linetype = "dashed", color = "blue") +
  geom_vline(xintercept = mean(df$PC1) - 2*pca_alz_genes_scaled$sdev[1], linetype = "dashed", color = "blue") +
  labs(title = "PC1 density with 2*SD scaled PCA") +
  geom_text(data = outlierspc1_scaled, 
            aes(x = outlierspc1_scaled[,1], y= 0, label = rownames(outlierspc1_scaled)), 
            vjust = 1.5, hjust = 0, size = 3, color = "#E9965A", angle = 90, fontface= "bold") +
  geom_rug(data = as.data.frame(pca_alz_genes_scaled$x), aes(x= PC1, y = 0), color= ifelse(abs(pca_alz_genes_scaled$x[,1]) > 2*(pca_alz_genes_scaled$sdev[1]), "#8B1A1A", "grey")) +
  theme_minimal()

plot2 <- ggplot(df, aes(x = PC2)) +
  geom_density(fill = "#00CED1", alpha = 0.5) +
  geom_vline(xintercept = mean(df$PC2) + 2*pca_alz_genes_scaled$sdev[2], linetype = "dashed", color = "blue") +
  geom_vline(xintercept = mean(df$PC2) - 2*pca_alz_genes_scaled$sdev[2], linetype = "dashed", color = "blue") +
  labs(title = "PC2 density with 2*SD scaled PCA") +
  geom_text(data = outlierspc2_scaled, 
            aes(x = outlierspc2_scaled[,1], y= 0, label = rownames(outlierspc2_scaled)), 
            vjust = 1.5, hjust = 0, size = 3, color = "#E9965A", angle = 90, fontface= "bold") +
  geom_rug(data = as.data.frame(pca_alz_genes_scaled$x), aes(x= PC2, y = 0), color= ifelse(abs(pca_alz_genes_scaled$x[,2]) > 2*(pca_alz_genes_scaled$sdev[2]), "#8B1A1A", "grey")) +
  theme_minimal()

grid.arrange(plot1, plot2, ncol = 1)

mPC1_scaled <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc1_scaled),]
mPC2_scaled <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc2_scaled),]
mPC12_scaled <- mat_exp[(rownames(mat_exp) %in% rownames(outlierspc2_scaled)) & (rownames(mat_exp) %in% rownames(outlierspc1_scaled)),]

not_in_PC12_scaled <- mat_exp[!((rownames(mat_exp) %in% rownames(outlierspc2_scaled)) | (rownames(mat_exp) %in% rownames(outlierspc1_scaled))),]
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mPC1_scaled)), length(rownames(mPC2_scaled)), length(rownames(mPC12_scaled)))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mPC1_scaled <- c(rownames(mPC1_scaled), rep("", max_length - length(rownames(mPC1_scaled))))
rownames_mPC2_scaled <- c(rownames(mPC2_scaled), rep("", max_length - length(rownames(mPC2_scaled))))
rownames_mPC12_scaled <- c(rownames(mPC12_scaled), rep("", max_length - length(rownames(mPC12_scaled))))

final_table <- data.frame(
  mPC1 = rownames_mPC1_scaled,
  mPC2 = rownames_mPC2_scaled,
  mPC1_and_PC2 = rownames_mPC12_scaled
)

final_table

Voy ahora a ver de esas muestras seleccionadas, qué covariables están asociadas con esas muestras outliers

2.1.2 Covariables en outliers

rownames(covs) <- covs$mrna_id
covs2 <- covs

for (i in rownames(covs2)){
  if (i %in% rownames(mPC1_scaled) & !(i %in% rownames(mPC12_scaled))){
    covs2[i, "sampleset_PCA"] <- "mPC1"
  }
  if (i %in% rownames(mPC2_scaled) & !(i %in% rownames(mPC12_scaled))){
    covs2[i, "sampleset_PCA"] <- "mPC2"
  }
  if (i %in% rownames(mPC12_scaled)){
    covs2[i, "sampleset_PCA"] <- "mPC1 and mPC2"
  }
  if (!(i %in% rownames(mPC1_scaled)) & !(i %in% rownames(mPC2_scaled))){
    covs2[i, "sampleset_PCA"] <- "Not in both"
  }
}

covs2$sampleset_PCA <- as.factor(covs2$sampleset_PCA)
data.frame(table(covs2$sampleset_PCA))
pca.data.kegg <- data.frame(pca_alz_genes_scaled$x)[,1:2]
colnames(pca.data.kegg) <- c("PC1.KEGG", "PC2.KEGG")

covs2 <- merge(covs2, pca.data.kegg, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
  col_name <- colnames(covs2)[i]
  if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric"){
    next
  }
  if (class(covs2[[i]]) == "factor"){
    p <- covs2 %>%
      group_by(across(all_of(col_name)), sampleset_PCA) %>%
      summarise(count = n(), .groups = 'drop') %>%
      mutate(total = sum(count), percentage = (count / total) * 100) %>%
      filter(sampleset_PCA == "mPC1" | sampleset_PCA == "mPC2" | sampleset_PCA == "mPC1 and mPC2") %>%
      ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_PCA")) +
      geom_bar(stat="identity", position=position_dodge()) +
      labs(x = NULL, y = "%", fill = col_name) +
      theme_minimal()
    
    plots[[colnames(covs2)[i]]] <- p
  }
}

grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype,  plots$neuroStatus, ncol=2)

plots.pca.kegg <- list()

# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
  # Crea una gráfica para cada variable
  p <- ggplot(covs2, aes_string(x = "PC1.KEGG", y = "PC2.KEGG", color = names(covs2)[i])) +
    geom_point(show.legend = TRUE) +
    geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outlierspc1_scaled) | mrna_id %in% rownames(outlierspc2_scaled), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 10, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = names(covs2)[i],
         x = "PC1.KEGG", y = "PC2.KEGG", color = names(covs2)[i]) +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm"),
      plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
    )

  plots.pca.kegg[[colnames(covs2)[i]]] <- p
}

grid.arrange(plots.pca.kegg$study, plots.pca.kegg$braaksc, ncol=1)

grid.arrange(plots.pca.kegg$ceradsc, plots.pca.kegg$cogdx, ncol=1)

grid.arrange(plots.pca.kegg$apoe_genotype, plots.pca.kegg$neuroStatus, ncol=1)

2.2 tSNE

Parámetros clave en t-SNE: Perplexity (Perplejidad): Este es uno de los parámetros más importantes en t-SNE. La perplejidad se relaciona con el número de vecinos cercanos que t-SNE considera cuando mapea los datos. Un valor demasiado bajo hace que el modelo se concentre demasiado en la estructura local, mientras que un valor demasiado alto puede llevar a una visión global que pierde detalles. La elección óptima depende del tamaño del dataset, pero valores típicos están entre 5 y 50.

Número de iteraciones: t-SNE es un algoritmo iterativo, y el número de iteraciones puede afectar la estabilidad de los resultados. Un número insuficiente de iteraciones puede resultar en una organización incompleta, mientras que demasiadas iteraciones pueden no proporcionar beneficios adicionales y aumentar el tiempo de cálculo.

Tasa de aprendizaje: Este parámetro controla el tamaño del paso en cada actualización durante la optimización. Una tasa de aprendizaje muy baja puede hacer que el algoritmo tarde mucho en converger, mientras que una tasa demasiado alta puede llevar a una convergencia pobre.

set.seed(1234)
tsne <- Rtsne(mat_exp_alz_genes, dims = 2, theta = 0.0)

tsne.data <- as.data.frame(tsne$Y)
row.names(tsne.data) <- row.names(mat_exp_alz_genes)
tsne.data.covs <- merge(tsne.data, covs, by = "row.names")
tsne.data.covs$Row.names <- NULL
row.names(tsne.data.covs) <- tsne.data.covs$mrna_id

2.2.1 Seleccionamos outliers

rownames(tsne.data) <- rownames(mat_exp_alz_genes)

# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.tsne1 <- mean(tsne.data[,1])
sd.tsne1 <- sd(tsne.data[,1])
mean.tsne2 <- mean(tsne.data[,2])
sd.tsne2 <- sd(tsne.data[,2])

# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.tsne1 <- tsne.data[abs(tsne.data[,1] - mean.tsne1) > 2 * sd.tsne1, ]
outliers.tsne2 <-tsne.data[abs(tsne.data[,2] - mean.tsne2) > 2 * sd.tsne2, ]

mtSNE1 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne1),]
mtSNE2 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne2),]
mtSNE12 <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.tsne1)) & (rownames(mat_exp) %in% rownames(outliers.tsne2)),]

not_in_tSNE12 <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.tsne1)) | (rownames(mat_exp) %in% rownames(outliers.tsne2))),]

2.2.2 Covariables en outliers

for (i in rownames(covs2)){
  if (i %in% rownames(mtSNE1) & !(i %in% rownames(mtSNE12))){
    covs2[i, "sampleset_tSNE"] <- "mtSNE1"
  }
  if (i %in% rownames(mtSNE2) & !(i %in% rownames(mtSNE12))){
    covs2[i, "sampleset_tSNE"] <- "mtSNE2"
  }
  if (i %in% rownames(mtSNE12)){
    covs2[i, "sampleset_tSNE"] <- "mtSNE1 and mtSNE2"
  }
  if (!(i %in% rownames(mtSNE1)) & !(i %in% rownames(mtSNE2))){
    covs2[i, "sampleset_tSNE"] <- "Not in both"
  }
}

covs2$sampleset_tSNE <- as.factor(covs2$sampleset_tSNE)
data.frame(table(covs2$sampleset_tSNE))
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mtSNE1)), length(rownames(mtSNE2)), length(rownames(mtSNE12)))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_tsne1 <- c(rownames(mtSNE1), rep("", max_length - length(rownames(mtSNE1))))
rownames_tsne2 <- c(rownames(mtSNE2), rep("", max_length - length(rownames(mtSNE2))))
rownames_tsne12 <- c(rownames(mtSNE12), rep("", max_length - length(rownames(mtSNE12))))

final_table <- data.frame(
  mtSNE1 = rownames_tsne1,
  mtSNE2 = rownames_tsne2,
  mtSNE1_and_mtSNE2 = rownames_tsne12
)

final_table
colnames(tsne.data) <- c("tSNE1.KEGG", "tSNE2.KEGG")

covs2 <- merge(covs2, tsne.data, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
  col_name <- colnames(covs2)[i]
  
  if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric" ){
    next
  }
  if (class(covs2[[i]]) == "factor"){
    p <- covs2 %>%
      group_by(across(all_of(col_name)), sampleset_tSNE) %>%
      summarise(count = n(), .groups = 'drop') %>%
      filter(sampleset_tSNE == "mtSNE1" | sampleset_tSNE == "mtSNE2" | sampleset_tSNE == "mtSNE1 and mtSNE2") %>%
      mutate(total = sum(count), percentage = (count / total) * 100) %>%
      ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_tSNE")) +
      geom_bar(stat="identity", position=position_dodge()) +
      labs(x = NULL, y = "%", fill = col_name) +
      theme_minimal()
    
    plots[[colnames(covs2)[i]]] <- p
  }
}

grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype,  plots$neuroStatus, ncol=2)

plots.tsne.kegg <- list()

# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
  # Crea una gráfica para cada variable
  p <- ggplot(covs2, aes_string(x = "tSNE1.KEGG", y = "tSNE2.KEGG", color = names(covs2)[i])) +
    geom_point(show.legend = TRUE) +
    geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outliers.tsne1) | mrna_id %in% rownames(outliers.tsne2), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 20, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = names(covs2)[i],
         x = "tSNE1.KEGG", y = "tSNE2.KEGG", color = names(covs2)[i]) +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm"),
      plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
    )

  plots.tsne.kegg[[colnames(covs2)[i]]] <- p
}

grid.arrange(plots.tsne.kegg$study, plots.tsne.kegg$braaksc, ncol=1)

grid.arrange(plots.tsne.kegg$ceradsc, plots.tsne.kegg$cogdx, ncol=1)

grid.arrange(plots.tsne.kegg$apoe_genotype, plots.tsne.kegg$neuroStatus, ncol=1)

2.3 UMAP

local.config <- umap.defaults
# local.config$n_neighbors <- 4
# local.config$n_components <- 2
# local.config$n_epochs <- 100
# local.config$metric<- "euclidean"
set.seed(1234)
umap.ad <- umap(mat_exp_alz_genes,random_stage=1234, local.config)

umap.data <- as.data.frame(umap.ad$layout)
row.names(umap.data) <- row.names(mat_exp_alz_genes)
umap.data.covs <- merge(umap.data, covs2, by = "row.names")
umap.data.covs$Row.names <- NULL
row.names(umap.data.covs) <- umap.data.covs$mrna_id

2.3.1 Seleccionamos outliers

rownames(umap.data) <- rownames(mat_exp_alz_genes)

# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.umap1 <- mean(umap.data[,1])
sd.umap1 <- sd(umap.data[,1])
mean.umap2 <- mean(umap.data[,2])
sd.umap2 <- sd(umap.data[,2])

# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.umap1 <- umap.data[abs(umap.data[,1] - mean.umap1) > 2 * sd.umap1, ]
outliers.umap2 <-umap.data[abs(umap.data[,2] - mean.umap2) > 2 * sd.umap2, ]

mUMAP1 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap1),]
mUMAP2 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap2),]
mUMAP12 <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.umap1)) & (rownames(mat_exp) %in% rownames(outliers.umap2)),]

not_in_UMAP12 <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.umap1)) | (rownames(mat_exp) %in% rownames(outliers.umap2))),]

2.3.2 Covariables en outliers

for (i in rownames(covs2)){
  if (i %in% rownames(mUMAP1) & !(i %in% rownames(mUMAP12))){
    covs2[i, "sampleset_UMAP"] <- "mUMAP1"
  }
  if (i %in% rownames(mUMAP2) & !(i %in% rownames(mUMAP12))){
    covs2[i, "sampleset_UMAP"] <- "mUMAP2"
  }
  if (i %in% rownames(mUMAP12)){
    covs2[i, "sampleset_UMAP"] <- "mUMAP1 and mUMAP2"
  }
  if (!(i %in% rownames(mUMAP1)) & !(i %in% rownames(mUMAP2))){
    covs2[i, "sampleset_UMAP"] <- "Not in both"
  }
}

covs2$sampleset_UMAP <- as.factor(covs2$sampleset_UMAP)
data.frame(table(covs2$sampleset_UMAP))
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mUMAP1)), length(rownames(mUMAP2)), length(rownames(mUMAP12)))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_umap1 <- c(rownames(mUMAP1), rep("", max_length - length(rownames(mUMAP1))))
rownames_umap2 <- c(rownames(mUMAP2), rep("", max_length - length(rownames(mUMAP2))))
rownames_umap12 <- c(rownames(mUMAP12), rep("", max_length - length(rownames(mUMAP12))))

final_table <- data.frame(
  mUMAP1 = rownames_umap1,
  mUMAP2 = rownames_umap2,
  mUMAP1_and_mUMAP2 = rownames_umap12
)

final_table
plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
  col_name <- colnames(covs2)[i]
  
  if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric"){
    next
  }
  if (class(covs2[[i]]) == "factor"){
    p <- covs2 %>%
      group_by(across(all_of(col_name)), sampleset_UMAP) %>%
      summarise(count = n(), .groups = 'drop') %>%
      filter(sampleset_UMAP == "mUMAP1" | sampleset_UMAP == "mUMAP2" | sampleset_UMAP == "mUMAP1 and mUMAP2") %>%
      mutate(total = sum(count), percentage = (count / total) * 100) %>%
      ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_UMAP")) +
      geom_bar(stat="identity", position=position_dodge()) +
      labs(x = NULL, y = "%", fill = col_name) +
      theme_minimal()
    
    plots[[colnames(covs2)[i]]] <- p
  }
}

grid.arrange(plots$study , plots$braaksc,plots$ceradsc, plots$cogdx, plots$apoe_genotype,  plots$neuroStatus, ncol=2)

colnames(umap.data) <- c("UMAP1.KEGG", "UMAP2.KEGG")

covs2 <- merge(covs2, umap.data, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
plots.umap.kegg <- list()

# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
  # Crea una gráfica para cada variable
  p <- ggplot(covs2, aes_string(x = "UMAP1.KEGG", y = "UMAP2.KEGG", color = names(covs2)[i])) +
    geom_point(show.legend = TRUE) +
    geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outliers.umap1) | mrna_id %in% rownames(outliers.umap2), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 30, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = names(covs2)[i],
         x = "UMAP1.KEGG", y = "UMAP2.KEGG", color = names(covs2)[i]) +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm"),
      plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
    )

  plots.umap.kegg[[colnames(covs2)[i]]] <- p
}

grid.arrange(plots.umap.kegg$study, plots.umap.kegg$braaksc, ncol=1)

grid.arrange(plots.umap.kegg$ceradsc, plots.umap.kegg$cogdx, ncol=1)

grid.arrange(plots.umap.kegg$apoe_genotype, plots.umap.kegg$neuroStatus, ncol=1)

3 Geneset de AD de Andrews et al 2023 Lancet review

geneset.AD.Andrews <- read_delim("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/geneset AD Andrews 2023 Lancet.csv", delim = ";", escape_double = FALSE, trim_ws = TRUE)

geneset.AD.Andrews <- unique(geneset.AD.Andrews$Gene)
mat.expre.AD.Andrews <- data.frame(mat_exp[, colnames(mat_exp) %in% geneset.AD.Andrews])
venn.plot <- venn.diagram(
  x = list(GenesMatriz = colnames(mat_exp), GenesetAlzheimer = geneset.AD.Andrews),
  category.names = c("Matrix Genes", "Geneset Alzheimer"),
  filename = NULL,
  output = FALSE,  # Asegura que no se exporte a un archivo
  fill = c("#00CED1", "#E9967A"),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = 25, #posicion de las categoricas
  cat.dist = 0.1, #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = "Alzheimer genes in exp matrix Andrews",
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 1)
)

grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)

3.1 PCA

pca.AD.Andrews <- prcomp(mat.expre.AD.Andrews, scale = T)

3.1.0.1 Seleccionar muestras con el doble de la desviación típica

Seleccionamos muestras que esten 2 veces la desviacion tipica para la PC1 y PC2

outlierspc1.Andrews <- as.data.frame(pca.AD.Andrews$x[abs(pca.AD.Andrews$x[,1]) > 2*(pca.AD.Andrews$sdev[1]),1])

outlierspc2.Andrews <- as.data.frame(pca.AD.Andrews$x[abs(pca.AD.Andrews$x[,2]) > 2*(pca.AD.Andrews$sdev[2]),2])

Las ploteamos

Creamos los grupos de muestras seleccionados de los outliers de PC1 y/o PC2

mPC1.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc1.Andrews),]
mPC2.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc2.Andrews),]
mPC12.Andrews <- mat_exp[(rownames(mat_exp) %in% rownames(outlierspc2.Andrews)) & (rownames(mat_exp) %in% rownames(outlierspc1.Andrews)),]

not_in_PC12.Andrews <- mat_exp[!((rownames(mat_exp) %in% rownames(outlierspc2.Andrews)) | (rownames(mat_exp) %in% rownames(outlierspc1.Andrews))),]
rownames(covs2) <- covs2$mrna_id

for (i in rownames(covs2)){
  if (i %in% rownames(mPC1.Andrews) & !(i %in% rownames(mPC12.Andrews))){
    covs2[i, "sampleset_PCA_Andrews"] <- "mPC1"
  }
  if (i %in% rownames(mPC2.Andrews) & !(i %in% rownames(mPC12.Andrews))){
    covs2[i, "sampleset_PCA_Andrews"] <- "mPC2"
  }
  if (i %in% rownames(mPC12.Andrews)){
    covs2[i, "sampleset_PCA_Andrews"] <- "mPC1 and mPC2"
  }
  if (!(i %in% rownames(mPC1.Andrews)) & !(i %in% rownames(mPC2.Andrews))){
    covs2[i, "sampleset_PCA_Andrews"] <- "Not in both"
  }
}

covs2$sampleset_PCA_Andrews <- as.factor(covs2$sampleset_PCA_Andrews)
data.frame(table(covs2$sampleset_PCA_Andrews))
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mPC1.Andrews)), length(rownames(mPC2.Andrews)), length(rownames(mPC12.Andrews)))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mPC1.Andrews <- c(rownames(mPC1.Andrews), rep("", max_length - length(rownames(mPC1.Andrews))))
rownames_mPC2.Andrews <- c(rownames(mPC2.Andrews), rep("", max_length - length(rownames(mPC2.Andrews))))
rownames_mPC12.Andrews <- c(rownames(mPC12.Andrews), rep("", max_length - length(rownames(mPC12.Andrews))))

final_table <- data.frame(
  mPC1 = rownames_mPC1.Andrews,
  mPC2 = rownames_mPC2.Andrews,
  mPC1_and_PC2 = rownames_mPC12.Andrews
)

final_table

3.1.0.2 Covariables en outliers

Metemos una nueva variable en las covariables dependiendo de si son outliers de PC1 o del PC2 o de ninguno.

plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
  col_name <- colnames(covs2)[i]
  if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric"){
    next
  }
  if (class(covs2[[i]]) == "factor"){
    p <- covs2 %>%
      group_by(across(all_of(col_name)), sampleset_PCA_Andrews) %>%
      summarise(count = n(), .groups = 'drop') %>%
      mutate(total = sum(count), percentage = (count / total) * 100) %>%
      filter(sampleset_PCA_Andrews == "mPC1" | sampleset_PCA_Andrews == "mPC2" | sampleset_PCA_Andrews == "mPC1 and mPC2") %>%
      ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_PCA_Andrews")) +
      geom_bar(stat="identity", position=position_dodge()) +
      labs(x = NULL, y = "%", fill = col_name) +
      theme_minimal()
    
    plots[[colnames(covs2)[i]]] <- p
  }
}

grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype,  plots$neuroStatus, ncol=2)

pca.data.Andrews <- data.frame(pca.AD.Andrews$x)[,1:2]
colnames(pca.data.Andrews) <- c("PCA1.Andrews", "PCA2.Andrews")

covs2 <- merge(covs2, pca.data.Andrews, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
plots.pca.Andrews <- list()

# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
  # Crea una gráfica para cada variable
  p <- ggplot(covs2, aes_string(x = "PCA1.Andrews", y = "PCA2.Andrews", color = names(covs2)[i])) +
    geom_point(show.legend = TRUE) +
    geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outlierspc1.Andrews) | mrna_id %in% rownames(outlierspc2.Andrews), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 10, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = names(covs2)[i],
         x = "PCA1.Andrews", y = "PCA2.Andrews", color = names(covs2)[i]) +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm"),
      plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
    )

  plots.pca.Andrews[[colnames(covs2)[i]]] <- p
}

grid.arrange(plots.pca.Andrews$study, plots.pca.Andrews$braaksc, ncol=1)

grid.arrange(plots.pca.Andrews$ceradsc, plots.pca.Andrews$cogdx, ncol=1)

grid.arrange(plots.pca.Andrews$apoe_genotype, plots.pca.Andrews$neuroStatus, ncol=1)

3.2 tSNE

set.seed(1234)
tsne.Andrews <- Rtsne(mat.expre.AD.Andrews, dims = 2, theta = 0.0)

tsne.data.Andrews <- as.data.frame(tsne.Andrews$Y)
row.names(tsne.data.Andrews) <- row.names(mat.expre.AD.Andrews)
tsne.Andrews.covs <- merge(tsne.data.Andrews, covs, by = "row.names")
tsne.Andrews.covs$Row.names <- NULL
row.names(tsne.Andrews.covs) <- tsne.Andrews.covs$mrna_id

3.2.1 Seleccionamos outliers

rownames(tsne.data.Andrews) <- rownames(mat.expre.AD.Andrews)

# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.tsne1 <- mean(tsne.data.Andrews[,1])
sd.tsne1 <- sd(tsne.data.Andrews[,1])
mean.tsne2 <- mean(tsne.data.Andrews[,2])
sd.tsne2 <- sd(tsne.data.Andrews[,2])

# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.tsne1.Andrews <- tsne.data.Andrews[abs(tsne.data.Andrews[,1] - mean.tsne1) > 2 * sd.tsne1, ]
outliers.tsne2.Andrews <-tsne.data.Andrews[abs(tsne.data.Andrews[,2] - mean.tsne2) > 2 * sd.tsne2, ]

mtSNE1.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne1.Andrews),]
mtSNE2.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne2.Andrews),]
mtSNE12.Andrews <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.tsne1.Andrews)) & (rownames(mat_exp) %in% rownames(outliers.tsne2.Andrews)),]

not_in_tSNE12.Andrews <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.tsne1.Andrews)) | (rownames(mat_exp) %in% rownames(outliers.tsne2.Andrews))),]

3.2.2 Covariables en outliers

for (i in rownames(covs2)){
  if (i %in% rownames(mtSNE1.Andrews) & !(i %in% rownames(mtSNE12.Andrews))){
    covs2[i, "sampleset_tSNE_Andrews"] <- "mtSNE1"
  }
  if (i %in% rownames(mtSNE2.Andrews) & !(i %in% rownames(mtSNE12.Andrews))){
    covs2[i, "sampleset_tSNE_Andrews"] <- "mtSNE2"
  }
  if (i %in% rownames(mtSNE12.Andrews)){
    covs2[i, "sampleset_tSNE_Andrews"] <- "mtSNE1 and mtSNE2"
  }
  if (!(i %in% rownames(mtSNE1.Andrews)) & !(i %in% rownames(mtSNE2.Andrews))){
    covs2[i, "sampleset_tSNE_Andrews"] <- "Not in both"
  }
}

covs2$sampleset_tSNE_Andrews <- as.factor(covs2$sampleset_tSNE_Andrews)
data.frame(table(covs2$sampleset_tSNE_Andrews))
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mtSNE1.Andrews)), length(rownames(mtSNE2.Andrews)), length(rownames(mtSNE12.Andrews)))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_tsne1.Andrews <- c(rownames(mtSNE1.Andrews), rep("", max_length - length(rownames(mtSNE1.Andrews))))
rownames_tsne2.Andrews <- c(rownames(mtSNE2.Andrews), rep("", max_length - length(rownames(mtSNE2.Andrews))))
rownames_tsne12.Andrews <- c(rownames(mtSNE12.Andrews), rep("", max_length - length(rownames(mtSNE12.Andrews))))

final_table <- data.frame(
  mtSNE1 = rownames_tsne1.Andrews,
  mtSNE2 = rownames_tsne2.Andrews,
  mtSNE1_and_mtSNE2 = rownames_tsne12.Andrews
)

final_table
plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
  col_name <- colnames(covs2)[i]
  
  if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric" ){
    next
  }
  if (class(covs2[[i]]) == "factor"){
    p <- covs2 %>%
      group_by(across(all_of(col_name)), sampleset_tSNE_Andrews) %>%
      summarise(count = n(), .groups = 'drop') %>%
      filter(sampleset_tSNE_Andrews == "mtSNE1" | sampleset_tSNE_Andrews == "mtSNE2" | sampleset_tSNE_Andrews == "mtSNE1 and mtSNE2") %>%
      mutate(total = sum(count), percentage = (count / total) * 100) %>%
      ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_tSNE_Andrews")) +
      geom_bar(stat="identity", position=position_dodge()) +
      labs(x = NULL, y = "%", fill = col_name) +
      theme_minimal()
    
    plots[[colnames(covs2)[i]]] <- p
  }
}

grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype,  plots$neuroStatus, ncol=2)

colnames(tsne.data.Andrews) <- c("tSNE1.Andrews", "tSNE2.Andrews")

covs2 <- merge(covs2, tsne.data.Andrews, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
plots.tsne.Andrews <- list()

# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
  # Crea una gráfica para cada variable
  p <- ggplot(covs2, aes_string(x = "tSNE1.Andrews", y = "tSNE2.Andrews", color = names(covs2)[i])) +
    geom_point(show.legend = TRUE) +
    geom_text_repel(aes(label=ifelse((covs2$mrna_id %in% rownames(outliers.tsne1.Andrews) | covs2$mrna_id %in% rownames(outliers.tsne2.Andrews)), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 30, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = names(covs2)[i],
         x = "tSNE1.Andrews", y = "tSNE2.Andrews", color = names(covs2)[i]) +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm"),
      plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
    )

  plots.tsne.Andrews[[colnames(covs2)[i]]] <- p
}

grid.arrange(plots.tsne.Andrews$study, plots.tsne.Andrews$braaksc, ncol=1)

grid.arrange(plots.tsne.Andrews$ceradsc, plots.tsne.Andrews$cogdx, ncol=1)

grid.arrange(plots.tsne.Andrews$apoe_genotype, plots.tsne.Andrews$neuroStatus, ncol=1)

3.3 UMAP

local.config <- umap.defaults
# local.config$n_neighbors <- 4
# local.config$n_components <- 2
# local.config$n_epochs <- 100
# local.config$metric<- "euclidean"
set.seed(1234)
umap.ad.Andrews <- umap(mat.expre.AD.Andrews,random_stage=1234, local.config)

umap.data.Andrews <- as.data.frame(umap.ad.Andrews$layout)
row.names(umap.data.Andrews) <- row.names(mat.expre.AD.Andrews)
umap.data.covs.Andrews <- merge(umap.data.Andrews, covs2, by = "row.names")
umap.data.covs.Andrews$Row.names <- NULL
row.names(umap.data.covs.Andrews) <- umap.data.covs.Andrews$mrna_id

3.3.1 Seleccionamos outliers

rownames(umap.data.Andrews) <- rownames(mat.expre.AD.Andrews)

# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.umap1 <- mean(umap.data.Andrews[,1])
sd.umap1 <- sd(umap.data.Andrews[,1])
mean.umap2 <- mean(umap.data.Andrews[,2])
sd.umap2 <- sd(umap.data.Andrews[,2])

# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.umap1.Andrews <- umap.data.Andrews[abs(umap.data.Andrews[,1] - mean.umap1) > 2 * sd.umap1, ]
outliers.umap2.Andrews <-umap.data.Andrews[abs(umap.data.Andrews[,2] - mean.umap2) > 2 * sd.umap2, ]

mUMAP1.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap1.Andrews),]
mUMAP2.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap2.Andrews),]
mUMAP12.Andrews <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.umap1.Andrews)) & (rownames(mat_exp) %in% rownames(outliers.umap2.Andrews)),]

not_in_UMAP12.Andrews <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.umap1.Andrews)) | (rownames(mat_exp) %in% rownames(outliers.umap2.Andrews))),]

3.3.2 Covariables en outliers

for (i in rownames(covs2)){
  if (i %in% rownames(mUMAP1.Andrews) & !(i %in% rownames(mUMAP12.Andrews))){
    covs2[i, "sampleset_UMAP_Andrews"] <- "mUMAP1"
  }
  if (i %in% rownames(mUMAP2.Andrews) & !(i %in% rownames(mUMAP12.Andrews))){
    covs2[i, "sampleset_UMAP_Andrews"] <- "mUMAP2"
  }
  if (i %in% rownames(mUMAP12.Andrews)){
    covs2[i, "sampleset_UMAP_Andrews"] <- "mUMAP1 and mUMAP2"
  }
  if (!(i %in% rownames(mUMAP1.Andrews)) & !(i %in% rownames(mUMAP2.Andrews))){
    covs2[i, "sampleset_UMAP_Andrews"] <- "Not in both"
  }
}

covs2$sampleset_UMAP_Andrews <- as.factor(covs2$sampleset_UMAP_Andrews)
data.frame(table(covs2$sampleset_UMAP_Andrews))
# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mUMAP1.Andrews)), length(rownames(mUMAP2.Andrews)), length(rownames(mUMAP12.Andrews)))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_umap1.Andrews <- c(rownames(mUMAP1.Andrews), rep("", max_length - length(rownames(mUMAP1.Andrews))))
rownames_umap2.Andrews <- c(rownames(mUMAP2.Andrews), rep("", max_length - length(rownames(mUMAP2.Andrews))))
rownames_umap12.Andrews <- c(rownames(mUMAP12.Andrews), rep("", max_length - length(rownames(mUMAP12.Andrews))))

final_table <- data.frame(
  mUMAP1 = rownames_umap1.Andrews,
  mUMAP2 = rownames_umap2.Andrews,
  mUMAP1_and_mUMAP2 = rownames_umap12.Andrews
)

final_table
plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
  col_name <- colnames(covs2)[i]
  
  if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric"){
    next
  }
  if (class(covs2[[i]]) == "factor"){
    p <- covs2 %>%
      group_by(across(all_of(col_name)), sampleset_UMAP_Andrews) %>%
      summarise(count = n(), .groups = 'drop') %>%
      filter(sampleset_UMAP_Andrews == "mUMAP1" | sampleset_UMAP_Andrews == "mUMAP2" | sampleset_UMAP_Andrews == "mUMAP1 and mUMAP2") %>%
      mutate(total = sum(count), percentage = (count / total) * 100) %>%
      ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_UMAP_Andrews")) +
      geom_bar(stat="identity", position=position_dodge()) +
      labs(x = NULL, y = "%", fill = col_name) +
      theme_minimal()
    
    plots[[colnames(covs2)[i]]] <- p
  }
}

grid.arrange(plots$study , plots$braaksc,plots$ceradsc, plots$cogdx, plots$apoe_genotype,  plots$neuroStatus, ncol=2)

colnames(umap.data.Andrews) <- c("UMAP1.Andrews", "UMAP2.Andrews")

covs2 <- merge(covs2, umap.data.Andrews, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
plots.umap.Andrews <- list()

# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
  # Crea una gráfica para cada variable
  p <- ggplot(covs2, aes_string(x = "UMAP1.Andrews", y = "UMAP2.Andrews", color = names(covs2)[i])) +
    geom_point(show.legend = TRUE) +
    geom_text_repel(aes(label=ifelse((covs2$mrna_id %in% rownames(outliers.umap1.Andrews) | covs2$mrna_id %in% rownames(outliers.umap2.Andrews)), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 30, 
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = names(covs2)[i],
         x = "UMAP1.Andrews", y = "UMAP2.Andrews", color = names(covs2)[i]) +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm"),
      plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
    )

  plots.umap.Andrews[[colnames(covs2)[i]]] <- p
}

grid.arrange(plots.umap.Andrews$study, plots.umap.Andrews$braaksc, ncol=1)

grid.arrange(plots.umap.Andrews$ceradsc, plots.umap.Andrews$cogdx, ncol=1)

grid.arrange(plots.umap.Andrews$apoe_genotype, plots.umap.Andrews$neuroStatus, ncol=1)

4 Comparaciones

4.1 Entre aproximaciones

grid.arrange(plots.pca.kegg$neuroStatus, plots.pca.Andrews$neuroStatus,plots.tsne.kegg$neuroStatus, plots.tsne.Andrews$neuroStatus, plots.umap.kegg$neuroStatus, plots.umap.Andrews$neuroStatus,ncol = 2)

4.2 Muestras entre técnicas

all.outliers <- list(PCA.KEGG = c(rownames(outlierspc1_scaled), rownames(outlierspc2_scaled)),
                     tSNE.KEGG = c(rownames(outliers.tsne1), rownames(outliers.tsne2)),
                     UMAP.KEGG = c(rownames(outliers.umap1), rownames(outliers.umap2)),
                     PCA.Andrews = c(rownames(outlierspc1.Andrews), rownames(outlierspc2.Andrews)),
                     tSNE.Andrews = c(rownames(outliers.tsne1.Andrews), rownames(outliers.tsne2.Andrews)),
                     UMAP.Andrews = c(rownames(outliers.umap1.Andrews), rownames(outliers.umap2.Andrews))
)

max_length <- max(sapply(all.outliers, length))

# Función para normalizar las longitudes de las listas
normalize_length <- function(x, max_length) {
  length(x) <- max_length  # Esto rellenará con NA si la lista es más corta que max_length
  x
}

# Aplicar la normalización a cada lista en 'all.outliers'
all.outliers <- lapply(all.outliers, normalize_length, max_length = max_length)

# Convertir la lista normalizada en un data.frame
all.outliers <- data.frame(all.outliers)

all.outliers
# Convertimos el data frame a un vector para hacer la tabla
all_values.outliers <- unlist(all.outliers, use.names = FALSE)

# Usamos table() para contar las frecuencias de cada muestra
table_values <- table(all_values.outliers)

# Filtramos para mostrar solo las muestras que se repiten
repeated_samples <- names(table_values[table_values > 1])

# Unlist with names to track the original column of each value
all_values_named <- unlist(all.outliers, use.names = TRUE)

# Extract unique non-NA samples
unique_samples <- unique(all_values_named[!is.na(all_values_named)])

# Create an empty matrix to fill with presence (1) or absence (0)
incidence_matrix <- matrix(0, nrow = length(unique_samples), ncol = length(all.outliers))
rownames(incidence_matrix) <- unique_samples
colnames(incidence_matrix) <- names(all.outliers)

# Populate the incidence matrix
for (col in names(all.outliers)) {
  present_samples <- all.outliers[[col]][!is.na(all.outliers[[col]])]  # Extract non-NA samples from column
  incidence_matrix[rownames(incidence_matrix) %in% present_samples, col] <- 1
}

# Melt the incidence matrix for ggplot
melted_incidence_matrix <- melt(incidence_matrix, varnames = c("sample", "column"), value.name = "presence")

colnames(melted_incidence_matrix)[3] <- "presence"

# Convertir la matriz de incidencia en un data frame
incidence_df <- as.data.frame(incidence_matrix)
# Crear un data frame para contar y listar las columnas de aparición
summary_df <- data.frame(
  sample = rownames(incidence_df),
  count = rowSums(incidence_df),
  columns = apply(incidence_df, 1, function(row) paste(names(df)[as.logical(row)], collapse = ", "))
)

# Ordenar el summary_df para mejorar la visualización
summary_df <- summary_df %>% 
  arrange(desc(count)) %>% 
  mutate(sample = factor(sample, levels = sample))

# Crear el gráfico de barras
ggplot(summary_df, aes(x = sample, y = count, fill = count)) +  # Mostrar solo las primeras 20 muestras para claridad
  geom_bar(stat = "identity", color = "black", show.legend = FALSE) +
  coord_flip() +  # Invertir el gráfico para mejor visualización de etiquetas
  labs(x = "Sample", y = "Count of Appearances", title = "Sample Appearances Across dimensions") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        axis.title.x = element_text(size = 12, face = "bold"),
        axis.title.y = element_text(size = 12, face = "bold"),
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5))

# Calcular el número total de presencias para cada muestra
total_presences <- rowSums(incidence_matrix)
melted_incidence_matrix$total_presence <- total_presences[as.character(melted_incidence_matrix$sample)]

melted_incidence_matrix$column <- gsub(pattern = "outliers.", replacement = "", x = melted_incidence_matrix$column)

melted_incidence_matrix$sample <- gsub(pattern = "_.*", replacement = "", x = melted_incidence_matrix$sample)

filtered_data <- melted_incidence_matrix %>% 
  filter(presence == 1) %>%
  arrange(desc(total_presence)) %>%
  mutate(sample = factor(sample, levels = unique(sample)))

ggplot(filtered_data, aes(x = column, y = sample, size = total_presence)) +
  geom_point(color = "steelblue") +
  scale_size(range = c(2, 6), guide = "none") +  # Adjustable size scale with no size legend
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "right") +
  labs(y = "Sample", x= "Dimension", title = "Presence of Samples Across Dimensions", size = "Total Presence")

4.3 Diagrama de Venn

lista.outliers <- lapply(all.outliers, function(x) unique(na.omit(x)))
venn.plot <- venn.diagram(
  x = lista.outliers[1:3],
  category.names = c("PCA", "tSNE", "UMAP"),
  filename = NULL,
  output = FALSE,  
  fill = c("#440154ff", '#21908dff', '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = c(10, -10, 0), #posicion de las categoricas
  cat.dist = c(0.3, 0.35, 0.28), #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = paste0("Diagrama Venn KEGG geneset"),
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 0.9)
)

interseccion_ABC <- Reduce(intersect, lista.outliers[1:3])
grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_ABC), collapse = "\n"), 
          x = 0.5, 
          y = 0.3, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

interseccion_AB <- Reduce(intersect, lista.outliers[1:2])
grid.text(label = paste(gsub("_.*","",interseccion_AB[!(interseccion_AB %in% interseccion_ABC)]), collapse = "\n"), 
          x = 0.7, 
          y = 0.3, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

interseccion_BC <- Reduce(intersect, lista.outliers[2:3])
grid.text(label = paste(gsub("_.*","",interseccion_BC[!(interseccion_BC %in% interseccion_ABC)]), collapse = "\n"), 
          x = 0.32, 
          y = 0.35, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

venn.plot <- venn.diagram(
  x = lista.outliers[4:6],
  category.names = c("PCA", "tSNE", "UMAP"),
  filename = NULL,
  output = FALSE,  
  fill = c("#440154ff", '#21908dff', '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = c(10, -10, 0), #posicion de las categoricas
  cat.dist = c(0.35, 0.1, 0.3), #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = paste0("Diagrama Venn Andrews geneset"),
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 0.9)
)

interseccion_ABC <- Reduce(intersect, lista.outliers[4:6])
grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_ABC), collapse = "\n"), 
          x = 0.58, 
          y = 0.35, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

interseccion_AC <- Reduce(intersect, lista.outliers[c(4,6)])
grid.text(label = paste(gsub("_.*","",interseccion_AC[!(interseccion_AC %in% interseccion_ABC)]), collapse = "\n"), 
          x = 0.42, 
          y = 0.35, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

interseccion_BC <- Reduce(intersect, lista.outliers[5:6])
grid.text(label = paste(gsub("_.*","",interseccion_BC[!(interseccion_BC %in% interseccion_ABC)]), collapse = "\n"), 
          x = 0.68, 
          y = 0.4, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

venn.plot <- venn.diagram(
  x = lista.outliers[c(1,4)],
  category.names = c("KEGG", "Andrews"),
  filename = NULL,
  output = FALSE,  
  fill = c("#440154ff", '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = c(0, 0), #posicion de las categoricas
  cat.dist = c(0.1, 0.1), #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = paste0("Diagrama Venn PCA geneset"),
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 0.9)
)

interseccion_AB <- Reduce(intersect, lista.outliers[c(1,4)])
grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_AB), collapse = "\n"), 
          x = 0.58, 
          y = 0.4, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

venn.plot <- venn.diagram(
  x = lista.outliers[c(2,5)],
  category.names = c("KEGG", "Andrews"),
  filename = NULL,
  output = FALSE,  
  fill = c("#440154ff", '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = c(0, 0), #posicion de las categoricas
  cat.dist = c(0.1, 0.1), #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = paste0("Diagrama Venn tSNE geneset"),
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 0.9)
)

interseccion_AB <- Reduce(intersect, lista.outliers[c(2,5)])
grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_AB), collapse = "\n"), 
          x = 0.63, 
          y = 0.4, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))

venn.plot <- venn.diagram(
  x = lista.outliers[c(3,6)],
  category.names = c("KEGG", "Andrews"),
  filename = NULL,
  output = FALSE,  
  fill = c("#440154ff", '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = c(0, 0), #posicion de las categoricas
  cat.dist = c(0.1, 0.1), #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = paste0("Diagrama Venn UMAP geneset"),
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 0.9)
)

interseccion_AB <- Reduce(intersect, lista.outliers[c(3,6)])
grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_AB), collapse = "\n"), 
          x = 0.52, 
          y = 0.3, 
          gp = gpar(fontsize = 8, 
                    col = "black",
                    fontface = "italic"))